%matplotlib inline
import numpy as np
from matplotlib import pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import math
# Global matplotlib defaults used by every figure in this notebook.
plt.rc('font', size=10) # controls default text sizes
plt.rc('axes', titlesize=30) # fontsize of the axes title
# gamma distribution log-likelihood
def log_likelihood(a, b, X):
    """Log-likelihood of i.i.d. samples X under Gamma(shape=a, rate=b).

    Parameters
    ----------
    a : float -- shape parameter (a > 0).
    b : float -- rate parameter (b > 0); numpy's `scale` is 1/b.
    X : array-like of positive samples.

    Returns
    -------
    float -- sum of the log-density over all samples.
    """
    # math.lgamma(a) computes log(Gamma(a)) directly; the original
    # math.log(math.gamma(a)) overflows for a > ~171.
    return len(X) * (a * math.log(b) - math.lgamma(a)) + \
        (a - 1) * np.sum(np.log(X)) - b * np.sum(X)
def visualize_log_likelihood(ax, a_true, b_true, data):
    """Draw the gamma log-likelihood surface over an (a, b) grid on a 3-D axis.

    Marks the true parameters with a red dot and the best grid point with a
    blue triangle, then returns the decorated axis.
    """
    grid = np.arange(0.2, 5.0, 0.2)
    A, B = np.meshgrid(grid, grid)
    Z = np.zeros(A.shape)
    # Evaluate the likelihood cell by cell, tracking the best grid point seen.
    best = float("-inf")
    bi, bj = 0, 0
    for i, j in np.ndindex(*Z.shape):
        Z[i, j] = log_likelihood(A[i, j], B[i, j], data)
        if best <= Z[i, j]:
            bi, bj = i, j
            best = Z[i, j]
    ax.plot_surface(A, B, Z, cmap='viridis', edgecolor='black', alpha=0.4)
    # Red dot: true parameters.  Blue triangle: grid maximum.
    ax.scatter(a_true, b_true, log_likelihood(a_true, b_true, data), marker="o", color="red", s=500)
    ax.scatter(A[bi, bj], B[bi, bj], log_likelihood(A[bi, bj], B[bi, bj], data), marker="^", color="blue", s=500)
    ax.set_title(f"a={a_true:.2f}, b={b_true:.2f}, N={len(data):,}")
    ax.set_xlabel("a")
    ax.set_ylabel("b")
    ax.set_zlabel("log-l")
    for axis in (ax.xaxis, ax.yaxis, ax.zaxis):
        axis.label.set_fontsize(35)
    ax.set_zticks([])
    return ax
# Sanity check: draw N=100 gamma samples and evaluate the log-likelihood at
# the true parameters.  numpy's gamma is parameterized by scale = 1/rate.
a_true = 2
b_true = 2
scale = 1 / b_true
np.random.seed(0)
X = np.random.gamma(shape=a_true, scale=scale, size=100)
log_likelihood(a_true, b_true, X)  # notebook cell output: approx -93.108
-93.10795892181588
# Surface grid: one column per sample size N = 10^2..10^5, three rows giving
# three camera angles on the same surface (true parameters held fixed).
nrows = 3
ncols = 4
a_true = 2
b_true = 2
scale = 1 / b_true
fig, ax = plt.subplots(nrows, ncols, figsize=(30,25), subplot_kw=dict(projection='3d'), squeeze=False)
fig.suptitle("4 plots for different N with 3 views. Red spot is the true likelihood and Blue triangle is the Maxima", fontsize=35, y=0.01)
np.random.seed(0)
for i in range(ncols):
    N = 10 ** (i+2)  # 100, 1000, 10000, 100000 samples
    X = np.random.gamma(shape=a_true, scale=scale, size=N)
    true_l = log_likelihood(a_true, b_true, X)
    # Row 0: oblique view of the surface.
    ax[0,i] = visualize_log_likelihood(ax[0,i], a_true, b_true, X)
    ax[0,i].view_init(20, 100)
    # Row 1: near top-down view (contour-like).
    ax[1,i] = visualize_log_likelihood(ax[1,i], a_true, b_true, X)
    ax[1,i].view_init(100, 270)
    # Row 2: side view zoomed in around the true parameters; the z-window is
    # scaled with N because the log-likelihood magnitude grows roughly like N.
    ax[2,i] = visualize_log_likelihood(ax[2,i], a_true, b_true, X)
    ax[2,i].set_xlim(a_true - 2, a_true + 2)
    ax[2,i].set_ylim(b_true - 2, b_true + 2)
    ax[2,i].set_zlim(true_l - int(0.3 * N), true_l + int(0.3 * N))
    ax[2,i].view_init(0, 100)
fig.tight_layout(pad=1)
plt.show()
# Same three-view grid as above, but with random true parameters per column,
# showing the surface shape is not specific to a = b = 2.
nrows = 3
ncols = 4
fig, ax = plt.subplots(nrows, ncols, figsize=(30,25), subplot_kw=dict(projection='3d'), squeeze=False)
fig.suptitle("4 plots for different N with 3 views. Red spot is the true likelihood and Blue triangle is the Maxima", fontsize=35, y=0.01)
np.random.seed(0)
for i in range(ncols):
    N = 10 ** (i + 2)
    # Fresh true parameters for every column.
    a_true = np.random.uniform(0.5, 3)
    b_true = np.random.uniform(0.5, 3)
    scale = 1 / b_true
    X = np.random.gamma(shape=a_true, scale=scale, size=N)
    true_l = log_likelihood(a_true, b_true, X)
    # Row 0: oblique view.
    ax[0,i] = visualize_log_likelihood(ax[0,i], a_true, b_true, X)
    ax[0,i].view_init(20, 270)
    # Row 1: near top-down view.
    ax[1,i] = visualize_log_likelihood(ax[1,i], a_true, b_true, X)
    ax[1,i].view_init(100, 270)
    # Row 2: side view zoomed in around the true parameters.
    ax[2,i] = visualize_log_likelihood(ax[2,i], a_true, b_true, X)
    ax[2,i].set_xlim(a_true - 2, a_true + 2)
    ax[2,i].set_ylim(b_true - 2, b_true + 2)
    ax[2,i].set_zlim(true_l - int(0.3 * N), true_l + int(0.3 * N))
    ax[2,i].view_init(0, 100)
fig.tight_layout(pad=1)
plt.show()
The log-likelihood is a concave function that decreases as we move away from the true parameters. The yellow region represents the neighborhood of convergence around the maximum likelihood.
There is only one critical point represented by the blue triangle.
The maximum is located at $(\hat{a}, \hat{b})$ where $\hat{b} \approx b, \hat{a} \approx a$. The log-likelihood maxima is very close to the true parameters when N gets larger.
The first-order derivative of $\log \Gamma(a)$ is the digamma function. We can represent this as $(\log \Gamma(a))'$. Therefore,
$$\frac{\partial \ell(a,b)}{\partial a} = n\left(\log b - (\log \Gamma(a))'\right) + \sum_{i=1}^{n} \log x_i$$Similarly, $$\frac{\partial \ell(a,b)}{\partial b} = \frac{na}{b} - \sum_{i=1}^{n} x_i$$
The Newton-Raphson update is $$\Theta_{t+1} = \Theta_{t} - [\mathbb{H}(a_t, b_t)]^{-1} \nabla \ell(a_t, b_t)$$ where
$$ \Theta_{t} = \begin{pmatrix} a_t \\ b_t \end{pmatrix} $$The second-order derivative of $\log \Gamma(a)$ is the trigamma function. We can represent this as $(\log \Gamma(a))''$
Therefore hessian inverse is,
# Closed-form inverse Hessian used below (markdown residue from the export):
#   [H(a,b)]^{-1} = 1 / (n (1 - a (log Gamma(a))'')) * [[a, b], [b, b^2 (log Gamma(a))'']]
def update_newton(a, b, X):
    """One Newton-Raphson step toward the gamma MLE.

    Parameters
    ----------
    a, b : float -- current shape / rate estimates.
    X : array-like of positive samples.

    Returns
    -------
    numpy array [a_new, b_new] after one step theta - H^{-1} grad.
    """
    from scipy.special import polygamma  # local import keeps the cell self-contained

    def digamma(t):
        # (log Gamma)' -- first derivative
        return polygamma(0, t)

    def trigamma(t):
        # (log Gamma)'' -- second derivative
        return polygamma(1, t)

    n = len(X)
    # Closed-form 2x2 inverse Hessian of the log-likelihood.
    H_inv = (1 / (n * (1 - a * trigamma(a)))) * np.array([
        [a, b],
        [b, (b ** 2) * trigamma(a)],
    ])
    # Gradient of the log-likelihood w.r.t. (a, b).
    grad = np.array([
        n * math.log(b) - n * digamma(a) + np.sum(np.log(X)),
        n * a / b - np.sum(X),
    ])
    # Newton step: theta_{t+1} = theta_t - H^{-1} grad.
    return np.array([a, b]) - H_inv @ grad
def MLE_gamma(a_init, b_init, X, tolerance=1e-3, max_iters=100):
    """Newton-Raphson maximum-likelihood estimation for Gamma(a, b).

    Repeats update_newton until the relative change in log-likelihood drops
    below `tolerance` or `max_iters` steps have run.  If an update drives a
    parameter negative, iteration stops and -1 is returned as the count.

    Returns (a_hat, b_hat, iterations).
    """
    a, b = a_init, b_init
    prev = log_likelihood(a, b, X)
    rel_change = 1.0
    step = 0
    while step < max_iters and rel_change > tolerance:
        a, b = update_newton(a, b, X)
        # Newton can escape the valid parameter domain; flag with step = -1.
        if a < 0 or b < 0:
            step = -1
            break
        cur = log_likelihood(a, b, X)
        rel_change = abs((cur - prev) / prev)
        prev = cur
        step += 1
    return a, b, step
def get_str(run_id, a_init, b_init, a_new, b_new, iters):
    """Format one markdown table row summarizing a single estimation run."""
    cells = (run_id, f"{a_init:.2f}", f"{b_init:.2f}", f"{a_new:.2f}", f"{b_new:.2f}", iters)
    return "\n| " + " | ".join(str(c) for c in cells) + " |"
from IPython.display import display_markdown
# Run Newton MLE from a spread of starting points around the truth and
# tabulate (as markdown) how many iterations each start takes to converge.
n = 10
a_true = 2
b_true = 2
X = np.random.gamma(shape=a_true, scale=1/b_true, size=int(1e5))
delta = np.linspace(-1, 3, n)  # offsets applied to both initial parameters
display_markdown(f"### a_true = {a_true}, b_true = {b_true}", raw=True)
# Trial run starting exactly at the true parameters.
a_new, b_new, iters = MLE_gamma(a_true, b_true, X, tolerance=1e-3)
table = "| run | a_init | b_init | a_new | b_new | iterations |\n|---|---|---|---|---|---|"
table += get_str("trial", a_true, b_true, a_new, b_new, iters)
for i in range(n):
    a, b = a_true + delta[i], b_true + delta[i]
    a_new, b_new, iters = MLE_gamma(a, b, X)
    table += get_str(i+1, a, b, a_new, b_new, iters)
display_markdown(table, raw=True)
| run | a_init | b_init | a_new | b_new | iterations |
|---|---|---|---|---|---|
| trial | 2.00 | 2.00 | 1.99 | 1.99 | 1 |
| 1 | 1.00 | 1.00 | 1.99 | 1.99 | 4 |
| 2 | 1.44 | 1.44 | 1.99 | 1.99 | 3 |
| 3 | 1.89 | 1.89 | 1.99 | 1.98 | 1 |
| 4 | 2.33 | 2.33 | 1.99 | 1.99 | 2 |
| 5 | 2.78 | 2.78 | 1.99 | 1.99 | 3 |
| 6 | 3.22 | 3.22 | 1.99 | 1.99 | 4 |
| 7 | 3.67 | 3.67 | 1.99 | 1.99 | 6 |
| 8 | 4.11 | 4.11 | -0.41 | -0.42 | -1 |
| 9 | 4.56 | 4.56 | -1.53 | -1.55 | -1 |
| 10 | 5.00 | 5.00 | -2.87 | -2.88 | -1 |
For the gamma distribution $X \sim Gamma(a, b)$, the first moment is given by $\mathbb{E}(X) = \frac{a}{b}$ and the second moment is given by $\mathbb{E}(X^2) = \frac{a(a+1)}{b^2}$
The first sample moment is $\bar{X} = \frac{1}{n} \sum_{i=1}^{n} x_i$ and the second sample moment is $\bar{X^2} = \frac{1}{n} \sum_{i=1}^{n} x_i^2$
$$\frac{a}{b} = \frac{1}{n} \sum_{i=1}^{n} x_i$$$$\frac{a(a+1)}{b^2} = \frac{1}{n} \sum_{i=1}^{n} x_i^2$$From equation 1 we can get $$a = b\bar{X}$$ Substituting this value in the second moment's equation implies that, $$\frac{b\bar{X}(b\bar{X}+1)}{b^2} = \frac{1}{n} \sum_{i=1}^{n} x_i^2$$
By solving this we get, $$\hat{b} = \frac{\bar{X}}{\frac{1}{n} \sum_{i=1}^{n} \left(x_i^2 - \bar{X}^2\right)}$$ Therefore for $\hat{a}$, $$\hat{a} = \hat{b}\bar{X}$$
# Method-of-moments estimators derived above (markdown residue from export):
#   a_hat = Xbar^2 / ((1/n) sum(x_i^2 - Xbar^2)),  b_hat = Xbar / (same)
def MME_gamma(X):
    """Method-of-moments estimates (a_hat, b_hat) for Gamma(a, b).

    Solves a/b = m1 and a(a+1)/b^2 = m2 using the first two sample moments.
    """
    X = np.asarray(X)
    m1 = X.mean()             # first sample moment
    m2 = (X ** 2).mean()      # second sample moment
    inv = 1 / (m2 - m1 ** 2)  # reciprocal of the (biased) sample variance
    return m1 ** 2 * inv, m1 * inv
# Compare the two estimators on a fresh data set: use the MME estimate as
# the starting point for the Newton MLE refinement.
a_true = 2
b_true = 2
scale = 1 / b_true
X = np.random.gamma(shape=a_true, scale=scale, size=int(1e3))
a_hat, b_hat = MME_gamma(X)
# MLE seeded at the MME solution converges in very few iterations.
a_hat2, b_hat2, iters = MLE_gamma(a_hat, b_hat, X, tolerance=1e-6)
display_markdown(f"#### a_true = {a_true}, b_true = {b_true}", raw=True)
display_markdown(f"#### MME Gamma: a = {a_hat:.2f}, b = {b_hat:.2f}", raw=True)
display_markdown(f"#### MLE Gamma: a = {a_hat2:.2f}, b = {b_hat2:.2f}, iterations = {iters}", raw=True)
Let $\hat\theta$ be an estimator of the random variables, given that we have observed the random variable $X$. The mean squared error (MSE) of this estimator against the true parameter $\theta$ is defined as
$$MSE(\hat\theta, \theta) = \mathbb{E}[||\hat\theta - \theta||^2] \\
= Var(\hat\theta) + Bias^2(\hat\theta, \theta)$$
The following code computes the MSE over a large number of data sets ($T = 1000$) of fixed size $N \in [10^2, 10^5]$.
from scipy.stats import gamma  # NOTE(review): imported but unused in this cell
from tqdm import tqdm
# Monte-Carlo comparison of MME vs MLE: for each sample size N, run T
# independent experiments and accumulate the squared estimation errors.
iters = 0
T = 1000
support = []
mse = {
    "mme": {"a": [], "b": []},
    "mle": {"a": [], "b": []},
}
print(f"Running for {T} iterations per N samples")
for i in tqdm(range(2, 6)):
    # Running sums of squared errors for this sample size.
    mme_stats = {"a": 0, "b": 0, "mse_a": [], "mse_b": []}
    mle_stats = {"a": 0, "b": 0, "mse_a": [], "mse_b": []}
    N = 10 ** i
    support.append(N)
    for _ in range(T):
        # True parameters drawn near 2 so the fixed MLE start (1.5, 1.5) is close.
        a_true = np.random.uniform(2, 2.5)
        b_true = np.random.uniform(2, 2.5)
        X = np.random.gamma(shape=a_true, scale=1/b_true, size=N)
        a_hat_mme, b_hat_mme = MME_gamma(X)
        a_hat_mle, b_hat_mle, iters = MLE_gamma(1.5, 1.5, X)
        if iters < 0:
            # Newton diverged to negative parameters; abandon the experiment.
            print(f"negatives found stopping at {T}", a_true, b_true)
            break
        mme_stats["a"] += (a_true-a_hat_mme)**2
        mme_stats["b"] += (b_true-b_hat_mme)**2
        mle_stats["a"] += (a_true-a_hat_mle)**2
        mle_stats["b"] += (b_true-b_hat_mle)**2
    if iters < 0:
        break
    # Average the accumulated squared errors into the MSE for this N.
    mse["mme"]["a"].append(mme_stats["a"]/T)
    mse["mme"]["b"].append(mme_stats["b"]/T)
    mse["mle"]["a"].append(mle_stats["a"]/T)
    mse["mle"]["b"].append(mle_stats["b"]/T)
if iters >= 0: #negative not found
    plt.rc('font', size=10) # controls default text sizes
    plt.rc('axes', titlesize=10) # fontsize of the axes title
    nrows = 1
    ncols = 2
    fig, ax = plt.subplots(nrows, ncols, figsize=(16,6))
    # One panel per parameter, MSE vs N on a log-x axis.
    for i, p in enumerate(["a", "b"]):
        ax[i].set_title(f"Parameter {p}", fontsize=15)
        ax[i].plot(support, mse["mme"][p])
        ax[i].plot(support, mse["mle"][p])
        ax[i].set_xscale("log")
        ax[i].set_xlabel("N samples")
        ax[i].set_ylabel("MSE")
        ax[i].legend(["Moments", "Log-likelihood"], fontsize=12)
        ax[i].xaxis.label.set_fontsize(15)
        ax[i].yaxis.label.set_fontsize(15)
Running for 1000 iterations per N samples
100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [01:24<00:00, 21.21s/it]
We can see from the graphs that $ MSE(\hat\theta, \theta) \to 0 $ as $ N \to \infty $
MLE_gamma is more accurate relative to MME_gamma.
We have random variables $$X \sim Poisson(\theta)$$
Therefore, the likelihood for $\theta$ given an observation $x$ is $$p(x \mid \theta) = \frac{\theta^{x} e^{-\theta}}{x!}$$
Consider known parameters $k = 1$, $\lambda=2.95$, $x = 10$
The posterior distribution is $$\theta \mid X = x \sim Gamma(a, b), \quad a = k + x, \; b = \lambda + 1$$
import matplotlib.pyplot as plt
import scipy.stats as stats
import numpy as np
import math
np.seterr(all='raise')  # turn numpy floating-point warnings into exceptions
# Given parameters
# One observation x = 10 from X ~ Poisson(theta), prior theta ~ Gamma(k=1, lambda=2.95).
X = [10]
a = (1 + sum(X))      # posterior shape: k + sum(x)
b = (2.95 + len(X))   # posterior rate: lambda + n
# Posterior Gamma Likelihood * Prior
def posterior(theta):
    """Unnormalized Gamma(a, b) posterior density at theta (a, b are module globals)."""
    return math.exp(-b * theta) * theta ** (a - 1)
# General Metropolis-Hastings sampler with a Gaussian random-walk proposal.
# The chain is thinned (every `thin`-th state kept) and the first half of
# the kept states is discarded as burn-in.
def metropolis_hastings(x_init, posterior, N, positive_only=False, thin=10):
    """Sample from an unnormalized density via Metropolis-Hastings.

    Parameters
    ----------
    x_init : float -- starting state of the chain.
    posterior : callable -- unnormalized target density.
    N : int -- number of proposal steps.
    positive_only : bool -- reflect proposals into [0, inf) for targets
        supported on the positive half-line (like the gamma posterior).
    thin : int -- keep every `thin`-th state (default 10, matching the
        original hard-coded value).

    Returns
    -------
    (samples, acceptance_rate) -- the second half of the thinned chain and
    the fraction of genuinely accepted moves.
    """
    x = x_init
    # NOTE(review): assumes posterior(x_init) > 0, otherwise pn/p divides by zero.
    p = posterior(x)
    accepted = 0
    samples = []
    for i in range(N):
        xn = x + np.random.normal()  # symmetric proposal, so the MH ratio is just pn/p
        if positive_only:
            xn = abs(xn)  # reflect negative proposals back into the support
        pn = posterior(xn)
        u = np.random.uniform()  # scalar draw; consumes the RNG stream like uniform(0, 1, 1)
        alpha = min(pn / p, 1)
        if alpha > u:
            if x != xn:  # only genuine moves count toward the acceptance rate
                accepted += 1
            p = pn
            x = xn
        if i % thin == 0:
            samples.append(x)
    burn_in = len(samples) // 2
    return samples[burn_in:], (accepted / N)
# Evaluate the (unnormalized) posterior on a grid for plotting.
support = np.linspace(0.0, 10.0, 1000)
pdf = np.asarray([posterior(x) for x in support])
N = 10_00_000  # one million MCMC steps
lmbda_init = 1.5 #starting point
positives = True  # theta must stay positive for the gamma posterior
samples, acceptance_rate = metropolis_hastings(lmbda_init, posterior, N, positives)
samples = np.array(samples)
plt.rc('font', size=10) # controls default text sizes
plt.rc('axes', titlesize=15) # fontsize of the axes title
nrows = 1
ncols = 2
fig, ax = plt.subplots(nrows, ncols, figsize=(16,6))
# Left panel: analytic posterior curve alone.
ax[0].plot(support, pdf)
ax[0].set_xlabel("X")
ax[0].set_ylabel("Density")
ax[0].set_title('Metropolis Hastings Posterior for Gamma')
# Right panel: same curve overlaid on the MCMC sample histogram.
ax[1].plot(support, pdf)
ax[1].hist(samples, bins=50, density=True, edgecolor="black", linewidth=0.1)
ax[1].legend(["Posterior", "Sampled posterior"])
ax[1].set_xlabel("X")
ax[1].set_ylabel("Density")
ax[1].set_title(f'MCMC simulation N={N:,} acceptance={acceptance_rate:.2f}')
for i in range(2):
    ax[i].xaxis.label.set_fontsize(15)
    ax[i].yaxis.label.set_fontsize(15)
We fit a Gaussian kernel density estimate to the samples behind the above histogram and then compute the mean squared error of the true density against the estimated density.
No explicit bandwidth is applied, as the default fit is nearly the same.
from sklearn.metrics import mean_squared_error
from scipy.stats import gaussian_kde
# Fit a Gaussian KDE (default bandwidth) to the MCMC samples and compare it
# against the analytic posterior curve on the same support.
kde = gaussian_kde(samples)
estimated = kde.evaluate(support)
# NOTE(review): `pdf` is the unnormalized posterior while the KDE integrates
# to 1; the curves appear directly comparable here, but confirm the
# normalizing constant if the (a, b) parameters change.
mse = mean_squared_error(pdf, estimated)
plt.figure(figsize=(16,6))
plt.plot(support, pdf)
plt.plot(support, estimated, alpha=0.8)
plt.title(f"Mean Squared Error={mse:.3e}")
plt.legend(["truth", "estimated"])
plt.xlabel("X")
plt.ylabel("Density")
plt.show()